Basically, this is a way to perform semi-supervised learning with an unreasonably complicated sequence of processing stages. First, we scale the data. Then, we transform the data into a sparse binary representation with a totally random tree embedding. Then, we use a restricted Boltzmann machine to extract features from this representation.
Frankly, I'll be amazed if it works.
In [1]:
# Plotting and numerics setup for this notebook session.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# IPython magic: render figures inline in the notebook.
%matplotlib inline
# Larger default figure size and grid lines on every axes.
plt.rcParams['figure.figsize'] = 8, 12
plt.rcParams['axes.grid'] = True
# Default colormap for subsequent plots.
plt.set_cmap('brg')
In [2]:
# IPython magic: move up to the repository root so the relative
# paths below (settings/, output/, segmentMetadata.json) resolve.
cd ..
In [3]:
from python import utils
In [7]:
# Read the experiment configuration for this run.
with open("settings/forestselection_gavin.json") as settings_file:
    settings = utils.json.load(settings_file)
In [8]:
# Read the per-segment metadata shared across subjects.
with open("segmentMetadata.json") as meta_file:
    meta = utils.json.load(meta_file)
In [9]:
# Load the feature data selected by the settings (project helper;
# the exact structure of `data` is not visible from this notebook).
data = utils.get_data(settings)
In [10]:
# Builds per-subject train/test matrices from the loaded data and
# segment metadata (see da.build_training / da.build_test below).
da = utils.DataAssembler(settings,data,meta)
Then we just need to build training sets for each subject and apply the relevant models. Unfortunately, the cross-validator doesn't handle test segments so we won't be able to run any informative cross-validation.
In [13]:
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.neural_network
import sklearn.svm
In [38]:
# The four pipeline stages, re-fit per subject in the loop below.
# Stage 4: linear-kernel SVC with probability estimates enabled.
classifier = sklearn.svm.SVC(kernel='linear', probability=True)
# Stage 1: zero-mean, unit-variance feature scaling.
scaler = sklearn.preprocessing.StandardScaler()
# Stage 2: sparse binary codes from a totally random forest.
embedding = sklearn.ensemble.RandomTreesEmbedding(n_estimators=1000)
# Stage 3: 500 latent features from a Bernoulli RBM.
rbm = sklearn.neural_network.BernoulliRBM(n_components=500)
In [39]:
%%time
predictions = {}
for subject in settings['SUBJECTS']:
print("Processing " +subject)
Xtrain,ytrain = da.build_training(subject)
Xtest = da.build_test(subject)
X = np.vstack([Xtrain,Xtest])
print("Applying scaling.")
# then we want to fit preprocess the data
X = scaler.fit_transform(X)
print("Shape of data: {0}".format(X.shape))
print("Applying Tree embedding.")
X = embedding.fit_transform(X)
print("Shape of data: {0}".format(X.shape))
print("Applying RBM transformation.")
X = rbm.fit_transform(X)
print("Shape of data: {0}".format(X.shape))
#slice Xtrain and Xtest back off of X
Xtrain = X[:Xtrain.shape[0],:]
Xtest = X[Xtrain.shape[0]:,:]
print("Fitting classifier.")
# then fit the classifier
classifier.fit(Xtrain,ytrain)
print("Classifying test data.")
# then classify the test set
predictions[subject] = np.hstack([da.test_segments[np.newaxis].T,\
classifier.predict_proba(Xtest)[:,1][np.newaxis].T])
Writing this to a file for submission:
In [34]:
import csv
In [35]:
# Dump the per-subject predictions in submission format.
# newline='' is required by the csv module when writing in text mode;
# without it the writer emits extra blank rows on Windows.
with open("output/svc_tree_embedded_rbm_transform_pg.csv","w", newline='') as f:
    c = csv.writer(f)
    # header row expected by the scorer
    c.writerow(['clip','preictal'])
    for subject in settings['SUBJECTS']:
        for line in predictions[subject]:
            c.writerow(line)
In [36]:
# IPython shell escape: sanity-check the submission's line/word/byte counts.
!wc output/svc_tree_embedded_rbm_transform_pg.csv
In [37]:
# IPython shell escape: peek at the first rows of the submission file.
!head output/svc_tree_embedded_rbm_transform_pg.csv